This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
##all viruses - target and off target
library(tidyverse)
── Attaching core tidyverse packages ─── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2 ── Conflicts ───────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggpubr)
library(tidyplots)
Attaching package: ‘tidyplots’
The following object is masked from ‘package:ggpubr’:
gene_expression
library(ggthemes)
library(scales)
Attaching package: ‘scales’
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
#Linear model for TE data - separated by virus
#November 2024
#viral read depth
#use bowtie2 data
library(tidyverse)
library(broom)
library(boot)
####read counts/normalised read counts####
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
####read depths####
#import and combine read depth files
depth_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depth_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)
depths_reads <- left_join(depths_bt_all, metadata2, by = "Sample_id") |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
,
.default = "RNA"
))
####linear model for spike in viruses - mean read depth####
depths_reads_sub <- depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
mutate(log_depth = log10(mean_depth)) |>
mutate(log_viral_load = log10(Viral.load))
#inspect data
hist(depths_reads_sub$mean_depth)
hist(depths_reads_sub$Viral.load)
hist(depths_reads_sub$log_depth)
hist(depths_reads_sub$log_viral_load)
summary(depths_reads_sub$log_depth)
summary(depths_reads_sub$log_viral_load)
#all viruses together
lm1 <-
glm(log_depth ~ log_viral_load * virus,
data = depths_reads_sub,
family = "gaussian")
summary(lm1)
glm.diag.plots(lm1)
pred <- predict(lm1, type = "response")
rsq <- function (x, y) {
cor(x, y) ^ 2
}
rsq(depths_reads_sub$log_depth, pred)
##split by viruses
cols <- c("#4477AA",
"#66CCEE",
"#228833",
"#CCBB44",
"#EE6677",
"#AA3377")
facet_names <- c(
"Human_adenovirus_40" = "Human adenovirus 40",
"Human_betaherpesvirus" = "Human betaherpesvirus",
"Human_respiratory_syncytial_virus" = "Human respiratory syncytial virus",
"Influenza_B_virus" = "Influenza B virus",
"Mammalian_orthoreovirus3" = "Mammalian orthoreovirus 3",
"Zika_virus" = "Zika virus"
)
virus <- (unique(depths_reads_sub$virus)) %>%
rep(., each = 2) %>%
data.frame()
list_models <-
depths_reads_sub |>
group_split(virus) |>
map( ~ lm(log_depth ~ log_viral_load, data = .))
lm_tidy <-
map(list_models, broom::tidy) %>%
do.call(rbind.data.frame, .) %>%
cbind(virus, .) %>%
rename(virus = ".") %>%
select(virus, term, estimate) %>%
pivot_wider(names_from = "term", values_from = "estimate")
lm_summary <- depths_reads_sub |>
group_by(virus) |>
summarise(
Intercept = lm(log_depth ~ log_viral_load)$coefficients[1],
Coeff_x1 = lm(log_depth ~ log_viral_load)$coefficients[2],
R2 = summary(lm(log_depth ~ log_viral_load))$r.squared,
pvalue = summary(lm(log_depth ~ log_viral_load))$coefficients["log_viral_load", 4]
)
lm_combined <-
left_join(lm_tidy, lm_summary, by = "virus")
#ggsave("figures/compare_spike_ins_atcc/model_readdepth_dedup.png")
ggsave("figures/manuscript_figure_2025/PDF/Figure_1.pdf",
width = 8,
height = 5)
ggsave("figures/manuscript_figure_2025/PNG/Figure_1.png",
width = 8,
height = 5)
### Read-in data
#ATCC genomes updated version
#October 2024
#viral load vs read count normalised using different methods, comparing deduplicated and non-deduplicated
#use bowtie2 data
####read counts/normalised read counts####
#metadata
metadata <- read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
#normalise by both raw read count and genome length - should be the same as mean read depth
counts_reads_norm <- counts_reads |>
mutate(norm_counts1 = matched / QC_reads) |>
mutate(norm_counts2 = matched / QC_reads / length) |>
mutate(norm_counts3 = matched / length) |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
####read depths####
#import and combine read depth files
depth_dedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv"
)
Rows: 95 Columns: 12── Column specification ───────────────────────────────────
Delimiter: "\t"
chr (5): virus, Sample_id, Background, mapper, type
dbl (7): Viral load, mean_depth, median_depth, min, max...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
depth_nodedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
)
Rows: 95 Columns: 12── Column specification ───────────────────────────────────
Delimiter: "\t"
chr (5): virus, Sample_id, Background, mapper, type
dbl (7): Viral load, mean_depth, median_depth, min, max...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)
depths_reads <-
depths_bt_all |>
left_join(metadata2, by = "Sample_id") |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
)) |>
rename(Viral.load = `Viral load`)
#change labels in facet plots
facet_names <- c("dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated")
cols <- c("#4477AA",
"#66CCEE",
"#228833",
"#CCBB44",
"#EE6677",
"#AA3377")
#plot viral read counts (non normalised)
read_count <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(matched),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_linetype_manual(values=c("dotted", "solid")) +
ylab("Log (Viral Reads)") +
xlab("Genome copies") +
scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
read_count_norm1 <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(norm_counts1),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_linetype_manual(values=c("dotted", "solid")) +
ylab("Log (viral reads/cleaned reads)") +
xlab("Genome copies") +
scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
#normalise by raw read count and genome length
read_count_norm2 <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = (Viral.load),
y = log10(norm_counts2),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_linetype_manual(values=c("dotted", "solid")) +
ylab("Log (viral reads/cleaned reads/genome length)") +
xlab("Genome copies") +
scale_x_log10(labels = scales::trans_format("log10", scales::label_math()))
read_depths <-
depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(mean_depth),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 0, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
<<<<<<< HEAD
Error: unexpected input in:
" ) +
<<"
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import and combine read count files
#deduplicated and non-deduplicated
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_bt_all <- rbind(counts_dedup_bt, counts_nodedup_bt)
counts_reads <- left_join(counts_bt_all, metadata2, by = "Sample_id")
#normalise by both raw read count and genome length - should be the same as mean read depth
counts_reads_norm <- counts_reads |>
mutate(norm_counts1 = matched / QC_reads) |>
mutate(norm_counts2 = matched / QC_reads / length) |>
mutate(norm_counts3 = matched / length) |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
####compare library capture pool####
cols3 <- c("#228833", "#AA3377")
facet_names <- c("dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated")
metadata3 <-
metadata |>
select(Sample.ID, Pool.for.sequencing) |>
rename(Sample_id = Sample.ID) |>
rename(Pool = Pool.for.sequencing)
counts_pool <- left_join(counts_reads_norm, metadata3, by = "Sample_id")
pool_counts_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log10(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
legend.title = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (Viral Reads)") + xlab("Genome copies") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) +
scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
# pool_readcounts_norm <-
# counts_pool |>
# filter(Background != "p6") |>
# filter(Background != "control") |>
# ggplot(aes(x = virus, y = log10(norm_counts1))) +
# geom_boxplot() +
# facet_grid(Pool ~ type) +
# theme_few() +
# theme(
# axis.title.x = element_blank(),
# # axis.text.x = element_blank(),
# axis.title.y = element_text(size = 10),
# panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
# panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
# ) +
# ylab("log10(viral reads/cleaned reads)")
pool_norm1_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log10(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
legend.title = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (viral reads/cleaned reads)") + xlab("Genome copies") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) +
scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
# pool_readcounts_norm2 <-
# counts_pool |>
# filter(Background != "p6") |>
# filter(Background != "control") |>
# ggplot(aes(x = virus, y = log10(norm_counts2))) +
# geom_boxplot() +
# facet_grid(Pool ~ type) +
# theme_few() +
# theme(
# axis.title.x = element_blank(),
# # axis.text.x = element_blank(),
# axis.title.y = element_text(size = 10),
# panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
# panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
# ) +
# ylab("log10(viral reads/cleaned reads/genome length)")
pool_norm2_sum <-
counts_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log10(matched),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
legend.title = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (viral reads/cleaned reads/genome length)") + xlab("Genome copies") +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2")) +
scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5})))
#import and combine read depth files
depth_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depth_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depths_bt_all <- rbind(depth_dedup_bt, depth_nodedup_bt)
depths_reads <-
left_join(depths_bt_all, metadata2, by = "Sample_id") |>
mutate(genome_structure = case_when((
virus == "Human_adenovirus_40" |
virus == "Human_betaherpesvirus"
) ~ "DNA",
.default = "RNA"
))
depths_pool <- left_join(depths_reads, metadata3, by = "Sample_id")
# pool_readdepths <-
# depths_pool |>
# filter(Background != "p6") |>
# filter(Background != "control") |>
# ggplot(aes(x = virus, y = log10(mean_depth))) +
# geom_boxplot() +
# facet_grid(Pool ~ type) +
# theme_few() +
# theme(
# axis.title.x = element_blank(),
# axis.text.x = element_text(angle = 45, hjust = 1),
# axis.title.y = element_text(size = 10),
# panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
# panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
# ) +
# ylab("Log (mean read depth)")
pool_depths_sum <-
depths_pool |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = as.character(Viral.load),
y = log10(mean_depth),
colour = Pool
)) +
geom_boxplot() +
facet_grid( ~ type, labeller = as_labeller(facet_names)) +
theme_few() +
theme(
# axis.title.x = element_blank(),
# axis.text.x = element_blank(),
legend.title = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
ylab("Log (mean read depth)") +
xlab("Genome copies") +
scale_x_discrete(labels = c(bquote(10^{2}), bquote(10^{3}), bquote(10^{5}))) +
scale_color_manual(values = cols3, labels = c("Pool 1", "Pool 2"))
ggarrange(
pool_counts_sum,
pool_norm1_sum,
pool_norm2_sum,
pool_depths_sum,
nrow = 2,
ncol = 2,
common.legend = TRUE,
align = "hv"
)
#ggsave("figures/compare_spike_ins_atcc/pool_compare_viruses_sum.png",width=10,height=7)
ggsave("figures/manuscript_figure_2025/PDF/Figure_S2.pdf",
width = 12,
height = 8)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S2.png",
width = 12,
height = 8)
##plots of read counts and viral read counts
#November 2024
#metadata
metadata <-
read.csv(
"metadata/sampleIDs_TESpikeIn.csv",
header = TRUE
)
metadata2 <- metadata |>
select(Sample.ID, Number.of.read.pairs..quality.adaptor.trimmed.) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import read count files
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
counts_nodedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
#import viral reads mapped (to calculate proportions)
viral_reads_dedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
header = TRUE
)
viral_reads_nodedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
header = TRUE
)
cols2 <- c("#BB5566", "#004488")
facet_names <- c("dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated")
reads_metadata_dedup <-
left_join(counts_dedup_bt, metadata2, by = "Sample_id")
reads_viral_dedup <-
left_join(reads_metadata_dedup, viral_reads_dedup, by = "Sample_id")
reads_metadata_nodedup <-
left_join(counts_nodedup_bt, metadata2, by = "Sample_id")
reads_viral_nodedup <-
left_join(reads_metadata_nodedup, viral_reads_nodedup, by = "Sample_id")
reads_viral_all <- rbind(reads_viral_dedup, reads_viral_nodedup)
reads_plot <-
reads_viral_all |>
group_by(Background, Sample_id, Viral.load, type) |>
summarise(
total_reads = (QC_reads * 2),
viral_reads = total_virus_reads,
ATCC_reads = sum(matched),
prop_ATCC = ATCC_reads / total_reads,
prop_viral = total_virus_reads / total_reads,
diff = viral_reads - ATCC_reads
) |>
unique()
####read counts/depths split by background####
#change labels in facet plots
facet_names_bg <- c(
"dedup_TE" = "Deduplicated",
"nodedup_TE" = "Non-Deduplicated",
"p2" = "M1",
"p8" = "M2"
)
#break down by Background x virus
background_counts_reads <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(matched),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels =
c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(labels = scales::label_log10()) +
ylab("Log (Viral Reads)")
backgrounds_counts_reads_norm <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(norm_counts1),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_bw() +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(label = scales::label_log10()) +
ylab("Log (viral reads/cleaned reads)")
backgrounds_counts_reads_norm2 <-
counts_reads_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(norm_counts2),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(labels = scales::label_log10()) +
ylab("Log (viral reads/cleaned reads/genome length)")
background_read_depths <-
depths_reads |>
filter(Background != "p6") |>
filter(Background != "control") |>
ggplot(aes(
x = Viral.load,
y = log10(mean_depth),
colour = virus
)) +
geom_point() +
geom_smooth(aes(group = virus, linetype = genome_structure), se = FALSE) +
facet_grid(Background ~ type, labeller = as_labeller(facet_names_bg)) +
theme_few() +
theme(
axis.title.x = element_blank(),
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray"),
axis.title.y = element_text(size = 10),
legend.title = element_blank()
) +
scale_color_manual(
values = cols,
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
) +
scale_x_log10(label = scales::label_log10()) +
ylab("Log (mean read depth)")
counts_nodedup_bt <-
read_tsv(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv"
)
Rows: 96 Columns: 9── Column specification ───────────────────────────────────
Delimiter: "\t"
chr (5): virus, Sample_id, Background, mapper, type
dbl (4): length, matched, unmatched, Viral load
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
persite_coverage_nodedup <-
nodedup_per_site |>
rename(Viral.load = `Viral load`) |>
group_by(Sample_id, virus, Viral.load, Background, type) |>
filter(coverage > 0) |>
summarise(genome_coverage = n()) |>
left_join(lengths) |>
mutate(percent_coverage = genome_coverage / length)
`summarise()` has grouped output by 'Sample_id', 'virus', 'Viral.load', 'Background'. You can override using the `.groups` argument.Joining with `by = join_by(virus)`
persite_both <- rbind(dedup_per_site, nodedup_per_site)
persite_reads <- left_join(persite_both, metadata2, by = "Sample_id")
persite_norm <- persite_reads |>
mutate(norm_cov = coverage / QC_reads)
coverage_labels3 <-c("100" = "1e+02",
"1000" = "1e+03",
"100000" = "1e+05",
"p2" = "M1",
"p8"= "M2")
persite_norm$`Viral load` <-
factor(persite_norm$`Viral load`,
labels = c("0",
"10^{2}",
"10^{3}",
"10^{5}"))
persite_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
filter(virus == "Human_respiratory_syncytial_virus") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(x = site, y = coverage, fill = Background)) +
geom_col() +
facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
ylab("Coverage") +
ggtitle("Human RSV (deduplicated)") +
guides(fill = guide_legend(title = "Background")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols2, labels = c("M1", "M2"))
ggsave("figures/manuscript_figure_2025/PDF/Figure_S4.pdf",
width=15,
height=10)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S4.png",
width=15,
height=10)
####Final plot
ggsave("figures/manuscript_figure_2025/PDF/Figure_S5.pdf",
width=15,
height=10)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S5.png",
width=15,
height=10)
####Final plot
ggsave("figures/manuscript_figure_2025/PDF/Figure_S6.pdf",
width=15,
height=10)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S6.png",
width=15,
height=10)
HBV <-
persite_norm |>
filter(Background != "p6") |>
filter(Background != "control") |>
filter(type == "dedup_TE") |>
filter(virus == "Human_betaherpesvirus") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(x = site, y = coverage, fill = Background)) +
geom_col() +
facet_wrap(Viral.load ~ Sample_id, label = label_parsed) +
ylab("Coverage") +
ggtitle("Human Betaherpesvirus (deduplicated)") +
guides(fill = guide_legend(title = "Background")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols2, labels = c("M1", "M2"))
####Final plot
ggsave(plot = HBV, file="figures/manuscript_figure_2025/PDF/Figure_S7.pdf",
width=15,
height=10,
dpi = 600)
ggsave(plot = HBV, file="figures/manuscript_figure_2025/PNG/Figure_S7.png",
width=15,
height=10)
#have to do these differently due to segmentation
FLU_ME1 <- persite_norm |>
filter(virus == "Influenza_B_virus") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Coverage") +
ggtitle("Influenza B virus (Background M1 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))
FLU_ME2 <- persite_norm |>
filter(virus == "Influenza_B_virus") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Coverage") +
ggtitle("Influenza B virus (Background M2 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols4, label = c(expression("10"^"2"), expression("10"^"3"), expression("10"^"5")))
ggarrange(FLU_ME1, FLU_ME2, ncol = 2, common.legend = TRUE)
Warning: log-10 transformation introduced infinite values.Warning: Removed 3867 rows containing missing values or values
outside the scale range (`geom_col()`).Warning: log-10 transformation introduced infinite values.Warning: Removed 3867 rows containing missing values or values
outside the scale range (`geom_col()`).Warning: log-10 transformation introduced infinite values.Warning: Removed 3867 rows containing missing values or values
outside the scale range (`geom_col()`).
####Final plot
ggsave("figures/manuscript_figure_2025/PDF/Figure_S8.pdf",
width=20,
height=12,
dpi=600)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S8.png",
width=20,
height=12)
REO_ME1 <-
persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p2") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Coverage") +
ggtitle("Mammalian orthoreovirus 3 (Background M1 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
) +
scale_y_log10() +
scale_fill_manual(values = cols4,
label = c(
expression("10" ^ "2"),
expression("10" ^ "3"),
expression("10" ^ "5")
))
REO_ME2 <-
persite_norm |>
filter(virus == "Mammalian_orthoreovirus3") |>
filter(type == "dedup_TE") |>
filter(Background == "p8") |>
rename(Viral.load = `Viral load`) |>
ggplot(aes(
x = site,
y = coverage,
fill = as.character(Viral.load)
)) +
geom_col() +
facet_grid(seg ~ Sample_id) +
ylab("Coverage") +
ggtitle("Mammalian orthoreovirus 3 (Background M2 deduplicated)") +
guides(fill = guide_legend(title = "Viral load")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
) + scale_y_log10() +
scale_fill_manual(values = cols4,
label = c(
expression("10" ^ "2"),
expression("10" ^ "3"),
expression("10" ^ "5")
))
ggarrange(REO_ME1, REO_ME2, ncol = 2, common.legend = TRUE)
Warning: log-10 transformation introduced infinite values.Warning: Removed 2181 rows containing missing values or values
outside the scale range (`geom_col()`).Warning: log-10 transformation introduced infinite values.Warning: Removed 2181 rows containing missing values or values
outside the scale range (`geom_col()`).Warning: log-10 transformation introduced infinite values.Warning: Removed 934 rows containing missing values or values
outside the scale range (`geom_col()`).
####Final plot
ggsave("figures/manuscript_figure_2025/PDF/Figure_S9.pdf",
width=20,
height=12,
dpi=600)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S9.png",
width=20,
height=12,
dpi=600)
# read in metadata
metadata <-
read.csv("metadata/sampleIDs_TESpikeIn.csv", header = TRUE)
metadata2 <-
metadata |>
select(
Sample.ID,
Background.sample,
Viral.load,
Number.of.read.pairs..quality.adaptor.trimmed.
) |>
rename(Sample_id = Sample.ID) |>
rename(QC_reads = Number.of.read.pairs..quality.adaptor.trimmed.)
#import viral reads mapped (to calculate proportions)
viral_reads_dedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_dedup_atcc_ref_20241108.csv",
header = TRUE
)
viral_reads_nodedup <-
read.csv(
"data_TE/total_virus_mapped_reads_per_sample_nodedup_atcc_ref_20241108.csv",
header = TRUE
)
#import contingency tables
contingency_dedup <-
read.csv(
"data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_dedup_atcc_ref_20241106.csv",
header = TRUE
)
contingency_nodedup <-
read.csv(
"data_TE/contingency_table_mapped_virus_reads_per_family_genus_sample_nodedup_atcc_ref_20241106.csv",
header = TRUE
)
contaminants <-
c("Betacoronavirus", "Alphainfluenzavirus", "Gammaretrovirus")
# pivot longer
dedup_long <-
contingency_dedup |>
filter(!genus %in% contaminants) |>
filter(genus != "NA") |>
pivot_longer(cols = A:P,
names_to = "Sample_id",
values_to = "count") |>
mutate(
target = case_when(
genus == "Cyclovirus" ~ "off_target",
genus == "Cardiovirus" ~ "off_target",
genus == "Kobuvirus" ~ "off_target",
.default = "on_target"
)
) |>
mutate(genome_structure = case_when((genus == "Mastadenovirus" |
genus == "Cytomegalovirus") ~ "DNA",
.default = "RNA"
))
no_dedup_long <- contingency_nodedup |>
filter(!genus %in% contaminants) |>
filter(genus != "NA") |>
pivot_longer(cols = A:P,
names_to = "Sample_id",
values_to = "count") |>
mutate(
target = case_when(
genus == "Cyclovirus" ~ "off_target",
genus == "Cardiovirus" ~ "off_target",
genus == "Kobuvirus" ~ "off_target",
.default = "on_target"
)
) |>
mutate(genome_structure = case_when((genus == "Mastadenovirus" |
genus == "Cytomegalovirus") ~ "DNA",
.default = "RNA"
))
metadata_dedup <- left_join(dedup_long, metadata2, by = "Sample_id")
dedup_viral_reads <-
left_join(metadata_dedup, viral_reads_dedup, by = "Sample_id") |>
mutate(total_reads = QC_reads * 2) |>
mutate(prop_total = count / total_reads) |>
mutate(prop_viral = count / total_virus_reads)
metadata_no_dedup <- left_join(no_dedup_long, metadata2, by = "Sample_id")
nodedup_viral_reads <-
left_join(metadata_no_dedup, viral_reads_nodedup, by = "Sample_id") |>
mutate(total_reads = QC_reads * 2) |>
mutate(prop_total = count / total_reads) |>
mutate(prop_viral = count / total_virus_reads)
####read counts/normalised read counts####
#import and combine read count files
gm_readcount_dedup <-
read.table(
"data_genome_medicine/genome_medicine_TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
gm_readcount_nodedup <-
read.table(
"data_genome_medicine/genome_medicine_TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
gm_counts_all <- rbind(gm_readcount_dedup, gm_readcount_nodedup)
#import and combine read depth files
depth_dedup_gm <-
read.table(
"data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depth_nodedup_gm <-
read.table(
"data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_sample_and_virus_bowtie2_nodedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
depths_gm_all <- rbind(depth_dedup_gm, depth_nodedup_gm)
####genome coverage####
unzip(zipfile = "data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv.zip", exdir = "data_genome_medicine/")
dedup_per_site <-
read.table(
"data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
file.remove("data_genome_medicine/genome_medicine_TE_sequencing_experiment_readdepth_per_site_sample_and_virus_bowtie2_dedup_atcc_ref.tsv")
[1] TRUE
#View(dedup_per_site)
counts_dedup_bt <-
read.table(
"data_TE/TE_sequencing_experiment_readcount_per_sample_and_virus_bowtie2_dedup_atcc_ref.tsv",
sep = "\t",
header = TRUE
)
#add length column from read depth file to per site file
lengths <-
counts_dedup_bt |>
select(virus,length) |>
distinct()
#calculate genome coverage
coverage <-
dedup_per_site |>
group_by(sample_id, virus, Viral.load, Replicate) |>
filter(coverage > 0) |>
summarise(genome_coverage = n()) |>
left_join(lengths) |>
mutate(percent_coverage = genome_coverage / length)
`summarise()` has grouped output by 'sample_id', 'virus', 'Viral.load'. You can override using the `.groups` argument.Joining with `by = join_by(virus)`
coverage |>
filter(Viral.load != 0) |>
ggplot(aes(x = virus, y = percent_coverage)) +
geom_boxplot(outlier.shape = NA) +
geom_point(position = position_dodge(width = .75)) +
facet_grid( ~ as.character(Viral.load)) +
theme_bw() +
theme(axis.title.y = element_text(size = 10)) +
ylab("Genome coverage") +
theme_few() +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.y = element_text(size = 15),
axis.text = element_text(size = 12),
strip.text = element_text(size = 12),
legend.title = element_blank(),
legend.position = "top",
panel.grid.major = element_line(linewidth = 0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth = 0.1, color = "lightgray")
) +
scale_y_continuous(limits = c(0, 1)) +
scale_x_discrete(
labels = c(
"Human adenovirus 40",
"Human betaherpesvirus",
"Human respiratory syncytial virus",
"Influenza B virus",
"Mammalian orthoreovirus 3",
"Zika virus"
)
)
#ggsave("/Users/laura/Dropbox/glasgow/github/te_ug_rodents/figures/genome_medicine/all_genome_coverage.png",width=14,height=6)
#ggsave("/Users/laura/Dropbox/glasgow/github/te_ug_rodents/figures/manuscript_figures_pdf/FigureS12.pdf",width=20,height=6)
#PDF
ggsave(
"figures/manuscript_figure_2025/PDF/Figure_S11.pdf",
width = 14,
height = 6
)
#PNG
ggsave(
"figures/manuscript_figure_2025/PNG/Figure_S11.png",
width = 14,
height = 6
)
ggsave("figures/manuscript_figure_2025/PDF/Figure_S12.pdf",
width=12,
height=10)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S12.png",
width=12,
height=10)
kobu_depth <-
read.table(
"data_kobuvirus/kobuvirus_TE_polyomics_readdepth_20241007.tsv",
sep = "\t",
header = TRUE
) %>%
select(!expt) %>%
select(!Number.of.paired.end.reads..QT.)
cardio_depth <-
read.table(
"data_cardiovirus/cardiovirus_denovo_bowtie2_read_depth_per_sample_dedup.tsv",
sep = ",",
header = TRUE
)
cardio_depth_poly <-
read.table(
"data_cardiovirus/cardiovirus_denovo_bowtie2_readdepth_per_sample_dedup_polyomics.tsv",
sep = ",",
header = TRUE
) |>
select(-contig_name) |>
filter(Background %in% c("S18", "S19", "S43")) |>
select(-Background) |>
separate(Sample_id, into = c(NA, NA, "Background"), sep = "-", remove = FALSE)
depth_both <- rbind(kobu_depth, cardio_depth, cardio_depth_poly) |>
mutate(Viral.load = ifelse(is.na(Viral.load), 0, Viral.load))
depth_both$type <- depth_both$type |>
case_match("dedup_TE" ~ "dedup",
"dedup_polyomics" ~ "dedup",
.default = depth_both$type)
depth_both$Sample_id <- depth_both$Sample_id |>
case_match("RNA-Msp-p2" ~ "M1",
"RNA-Msp-p8" ~ "M2",
"RNA-Msp-p6" ~ "M3",
.default = depth_both$Sample_id
)
depth_both$virus <- depth_both$virus |>
case_match("cardiovirus" ~ "Cardiovirus",
"k97_19971" ~ "Kobuvirus",
.default = depth_both$virus
)
cols2 <- c("#BB5566", "#004488")
coverage_labels <- c(
"k97_19971" = "Kobuvirus",
"cardiovirus" = "Cardiovirus",
"p2" = "M1",
"p8" = "M2"
)
depth_both$Viral.load <- factor(depth_both$Viral.load, labels = c("Shotgun",
"10^{2}",
"10^{3}",
"10^{5}"
))
# depth_both |>
# filter(Background != "p6") |>
# filter(type == "dedup") |>
# filter(mapper == "bowtie2") |>
# filter(Viral.load != "NA") |>
# ggplot(aes(
# x = Viral.load,
# y = (mean_depth),
# colour = Background
# )) +
# geom_point() +
# theme_few() +
# theme(
# axis.title.x = element_text(size = 12),
# axis.title.y = element_text(size = 12),
# legend.title = element_blank(),
# legend.position = "none",
# panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
# panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
# ) +
# facet_grid(Background ~ virus, labeller = as_labeller(coverage_labels)) +
# scale_color_manual(values = cols2) +
# ylab("Mean read depth") +
# xlab("Spike-in viral load (gc/ml)") +
# scale_x_log10(labels = scales::trans_format("log10",
# scales::label_math())) +
# scale_y_log10(labels = scales::trans_format("log10",
# scales::label_math()))
# replotting S13 in similar way to S14
depth_both %>%
filter(Background != "p6") |>
filter(type == "dedup") |>
filter(mapper == "bowtie2") |>
# filter(Viral.load != "NA") |>
ggplot(aes(x = Sample_id, y = mean_depth, color = Background)) +
geom_point() +
ylab("Mean read depth") +
xlab("Sample ID") +
facet_grid(virus ~ Viral.load,
scales = "free_x",
label = label_parsed) +
scale_color_manual(values = cols2, labels = c("M1", "M2")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "right",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
)+
scale_y_log10(labels = scales::trans_format("log10",
scales::label_math()))
ggsave("figures/manuscript_figure_2025/PDF/Figure_S13_v2.pdf",
width=8,
height=4)
ggsave("figures/manuscript_figure_2025/PNG/Figure_S13_v2.png",
width=8,
height=4)
kobu_per_site <-
read.table(
"data_kobuvirus/kobuvirus_TE_polyomics_readdepth_persite_20241007.tsv",
sep = "\t",
header = TRUE
) %>%
select(virus,
seg,
site,
coverage,
n_sites,
Sample_id,
Background,
Viral.load,
mapper,
type)
kobu_per_site$Viral.load <-
kobu_per_site$Viral.load %>%
replace_na(., 0)
kobu_per_site$Sample_id <- kobu_per_site$Sample_id %>%
case_match("RNA-Msp-p2" ~ "M1",
"RNA-Msp-p8" ~ "M2",
.default = kobu_per_site$Sample_id)
kobu_per_site$Background <- kobu_per_site$Background %>%
case_match("p2" ~ "M1",
"p8" ~ "M2",
.default = kobu_per_site$Background)
cardio_per_site <-
read.table(
"data_cardiovirus/cardiovirus_denovo_bowtie2_read_depth_per_site_and_sample_dedup.tsv",
sep = ",",
header = TRUE
)
cardio_polyomics_per_site <-
read.table(
"data_cardiovirus/cardiovirus_denovo_bowtie2_readdepth_per_site_sample_dedup_polyomics.tsv",
sep = ",",
header = TRUE
) %>%
select(!contig_name)
#View(cardio_polyomics_per_site)
cardio_TE_polyomics <-
full_join(
cardio_per_site,
cardio_polyomics_per_site,
by = c(
"virus",
"seg",
"site",
"coverage",
"n_sites",
"Sample_id",
"Background",
"Viral.load",
"mapper",
"type"
)
)
cardio_TE_polyomics$Viral.load <- cardio_TE_polyomics$Viral.load %>%
replace_na(., 0)
drops <-
c(
"RNA-Msp-p1",
"RNA-Msp-p3",
"RNA-Msp-p4",
"RNA-Msp-p5",
"RNA-Msp-p6",
"RNA-Msp-p7",
"RNA-Msp-p9"
)
cardio_TE_polyomics <- cardio_TE_polyomics %>%
filter(!Sample_id %in% drops)
cardio_TE_polyomics$Sample_id <- cardio_TE_polyomics$Sample_id %>%
case_match("RNA-Msp-p2" ~ "M1",
"RNA-Msp-p8" ~ "M2",
.default = cardio_TE_polyomics$Sample_id)
cardio_TE_polyomics$Background <- cardio_TE_polyomics$Background %>%
case_match("S18" ~ "M1",
"S43" ~ "M2",
"p2" ~ "M1",
"p8" ~ "M2",
.default = cardio_TE_polyomics$Background)
persite_both <- rbind(kobu_per_site, cardio_TE_polyomics)
persite_both$type <- persite_both$type %>%
case_match("dedup_TE" ~ "dedup",
"dedup_polyomics" ~ "dedup",
.default = persite_both$type)
#TE data only - compare coverage between backgrounds/viral loads
cols2 <- c("#BB5566", "#004488")
coverage_labels <- c(
"100" = "10^{2}",
"1000" = "10^{3}",
"100000" = "10^{5}",
"0" = "Shotgun",
"Kobuvirus" = "Kobuvirus",
"Cardiovirus" = "Cardiovirus"
)
coverage_labels2 <- c(
"100" = "10^{2}",
"1000" = "10^{3}",
"100000" = "10^{5}",
"0" = "Shotgun",
"A" = "A",
"B" = "B",
"C" = "C",
"D" = "D",
"F" = "F",
"G" = "G",
"H" = "H",
"I" = "I",
"J" = "J",
"K" = "K",
"L" = "L",
"M" = "M",
"N" = "N",
"O" = "O",
"P" = "P",
"ME_P1" = "M1",
"ME_P2" = "M2"
)
kobu_coverage <- kobu_per_site %>%
filter(mapper == "bowtie2") %>%
filter(type == "dedup") %>%
filter(Background != "p6") %>%
group_by(Sample_id, Viral.load, Background) %>%
filter(coverage > 0) %>%
summarise(genome_coverage = n()) %>%
mutate(percent_coverage = genome_coverage / 8467) %>%
mutate(virus = "Kobuvirus")
`summarise()` has grouped output by 'Sample_id', 'Viral.load'. You can override using the `.groups` argument.
cardio_coverage <- cardio_TE_polyomics %>%
filter(Background != "p6") %>%
group_by(Sample_id, Viral.load, Background) %>%
filter(coverage > 0) %>%
summarise(genome_coverage = n()) %>%
mutate(percent_coverage = genome_coverage / 6945) %>%
mutate(virus = "Cardiovirus")
`summarise()` has grouped output by 'Sample_id', 'Viral.load'. You can override using the `.groups` argument.
coverage_both <- rbind(kobu_coverage, cardio_coverage)
coverage_both$Viral.load <- factor(coverage_both$Viral.load, labels = c("Shotgun",
"10^{2}",
"10^{3}",
"10^{5}"
))
coverage_both %>%
ggplot(aes(x = Sample_id, y = percent_coverage, color = Background)) +
geom_point() +
ylab("Genome coverage") +
xlab("Sample ID") +
facet_grid(virus ~ (Viral.load),
scales = "free_x",
label = label_parsed) +
scale_color_manual(values = cols2, labels = c("M1", "M2")) +
theme_few() +
theme(
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
legend.title = element_blank(),
legend.position = "right",
panel.grid.major = element_line(linewidth=0.1, color = "gray60"),
panel.grid.minor = element_line(linewidth=0.1, color = "lightgray")
) +
scale_y_continuous(limits = c(0, 1))
#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S14.pdf",
width = 8,
height = 4)
#PNG
ggsave("figures/manuscript_figure_2025/PNG/Figure_S14.png",
width = 8,
height = 4)
#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S15.pdf",
width=12,
height=7)
#PNG
ggsave("figures/manuscript_figure_2025/Png/Figure_S15.png",
width=12,
height=7)
#PDF
ggsave("figures/manuscript_figure_2025/PDF/Figure_S16.pdf",
width=12,
height=7)
#PNG
ggsave("figures/manuscript_figure_2025/PNG/Figure_S16.png",
width=12,
height=7)